import datetime

from langchain_community.document_loaders import PyPDFLoader
from langchain.document_loaders import MathpixPDFLoader


def pdf_analyse_py(pdf_file):
    """Load a PDF with ``PyPDFLoader``, report timing, and dump every chunk.

    Prints the elapsed load time, the number of items returned by
    ``load_and_split()``, and then each item in turn.

    Args:
        pdf_file: Location of the PDF; handed unchanged to ``PyPDFLoader``.

    Returns:
        The object returned by ``loader.load_and_split()`` so callers can
        reuse the loaded documents instead of re-parsing the file.
    """
    import time  # function-scope import keeps this block self-contained

    # Original author's note: loading this PDF took close to "3m"
    # (presumably three minutes).
    started = time.perf_counter()
    loader = PyPDFLoader(pdf_file)
    pages = loader.load_and_split()
    # perf_counter() is monotonic, so the measurement cannot be skewed by
    # wall-clock adjustments (NTP step, DST change) during a long load.
    # Wrapping it in a timedelta keeps the H:MM:SS.ffffff output format.
    cost = datetime.timedelta(seconds=time.perf_counter() - started)
    print(f"耗时:{cost}")
    print(len(pages))
    for page in pages:
        print(page)
    return pages


def pdf_analyse_math_pix(pdf_file_path):
    """Load a PDF with ``MathpixPDFLoader``, report timing, and dump the result.

    Prints the elapsed load time, the number of items returned by
    ``load()``, and then each item in turn.

    Args:
        pdf_file_path: Location of the PDF; handed unchanged to
            ``MathpixPDFLoader``.

    Returns:
        The object returned by ``loader.load()`` so callers can reuse the
        loaded documents instead of converting the file again.
    """
    import time  # function-scope import keeps this block self-contained

    # NOTE(review): MathpixPDFLoader presumably calls the remote Mathpix
    # service and needs API credentials configured — confirm before running.
    started = time.perf_counter()
    loader = MathpixPDFLoader(pdf_file_path)
    pages = loader.load()
    # perf_counter() is monotonic, so the measurement cannot be skewed by
    # wall-clock adjustments (NTP step, DST change) during a long load.
    # Wrapping it in a timedelta keeps the H:MM:SS.ffffff output format.
    cost = datetime.timedelta(seconds=time.perf_counter() - started)
    print(f"耗时:{cost}")
    print(len(pages))
    for page in pages:
        print(page)
    return pages


# Sample PDF used for the loader comparison. The path is relative, so it is
# resolved against the current working directory when the script runs.
pdf_path_sea_ship = "../pdf/钢质海船入级规范2022.pdf"

# The PyPDFLoader variant is currently switched off:
# pdf_analyse_py(pdf_path_sea_ship)

# Runs at import time: time and print the Mathpix-based load of the sample PDF.
pdf_analyse_math_pix(pdf_file_path=pdf_path_sea_ship)
